Grocery Retailer
A large national grocery retailer tracks productivity and cost of its facilities closely. The data in the grocery.txt file were obtained from a single distribution center for a one year period. Each data point for each variable represents one week of activity. The variables included are the number of cases shipped (\(X_1\)), the indirect costs of the total labor hours as a percentage (\(X_2\)), a qualitative predictor called holiday that is coded one if the week contains a holiday and zero otherwise (\(X_3\)), and the total labor hours (\(Y\)).
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(faraway)
# Read the weekly grocery data (file has no header row) and label the
# columns: Y = total labor hours, X1 = cases shipped, X2 = indirect cost
# percentage, X3 = holiday indicator.
grocery <- read.table("grocery.txt", header = FALSE)
colnames(grocery) <- c("Y", "X1", "X2", "X3")
# Fit the full multiple linear regression of Y on all three predictors.
grocery.full <- lm(Y ~ ., data = grocery)
summary(grocery.full)
##
## Call:
## lm(formula = Y ~ ., data = grocery)
##
## Residuals:
## Min 1Q Median 3Q Max
## -264.05 -110.73 -22.52 79.29 295.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.150e+03 1.956e+02 21.220 < 2e-16 ***
## X1 7.871e-04 3.646e-04 2.159 0.0359 *
## X2 -1.317e+01 2.309e+01 -0.570 0.5712
## X3 6.236e+02 6.264e+01 9.954 2.94e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143.3 on 48 degrees of freedom
## Multiple R-squared: 0.6883, Adjusted R-squared: 0.6689
## F-statistic: 35.34 on 3 and 48 DF, p-value: 3.316e-12
Check the constant variance assumption.
# Residuals vs fitted values: plot.lm's first diagnostic plot, then the
# same diagnostic drawn by hand from the model components.
plot(grocery.full, which=1)
plot(grocery.full$fitted.values, grocery.full$residuals)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Breusch-Pagan test; H0: homoscedastic errors. The large p-value
# reported below (0.1544) gives no evidence against constant variance.
bptest(grocery.full)
##
## studentized Breusch-Pagan test
##
## data: grocery.full
## BP = 5.2504, df = 3, p-value = 0.1544
Check the normality assumption.
# Normal Q-Q plot, residual histogram, and Shapiro-Wilk test
# (H0: residuals are normally distributed).
plot(grocery.full, which=2)
hist(grocery.full$residuals)
shapiro.test(residuals(grocery.full))
##
## Shapiro-Wilk normality test
##
## data: residuals(grocery.full)
## W = 0.97575, p-value = 0.3644
Check for the structure of the relationship between the predictors and the response.
The Partial Regression or Added Variable plots are used to check the model structure \(E(\mathbf{y})=\mathbf{X}\beta\).
# Added-variable (partial regression) plots, one per predictor.
# NOTE(fix): the original code re-assigned `grocery.full` to each reduced
# model, silently clobbering the full model fitted above; distinct names
# are used here for the reduced-response fits.
# X1: residuals of Y ~ X2 + X3 against residuals of X1 ~ X2 + X3.
grocery.y.noX1 <- lm(Y ~ X2 + X3, data = grocery)
grocery.mlr.X1 <- lm(X1 ~ X2 + X3, data = grocery)
plot(grocery.mlr.X1$residuals, grocery.y.noX1$residuals)
# X2: residuals of Y ~ X1 + X3 against residuals of X2 ~ X1 + X3.
grocery.y.noX2 <- lm(Y ~ X1 + X3, data = grocery)
grocery.mlr.X2 <- lm(X2 ~ X1 + X3, data = grocery)
plot(grocery.mlr.X2$residuals, grocery.y.noX2$residuals)
# X3: residuals of Y ~ X1 + X2 against residuals of X3 ~ X1 + X2.
grocery.y.noX3 <- lm(Y ~ X2 + X1, data = grocery)
grocery.mlr.X3 <- lm(X3 ~ X2 + X1, data = grocery)
plot(grocery.mlr.X3$residuals, grocery.y.noX3$residuals)
Use the teengamb data from the faraway library to fit a model with gamble as the response and the other variables as predictors.
library(ggplot2)
library(faraway)
# Load the teengamb data from faraway and list its variables.
data("teengamb")
colnames(teengamb)
## [1] "sex" "status" "income" "verbal" "gamble"
# Fit the full model with gamble as the response.
# NOTE(fix): dropped `attach(teengamb)` — the attached frame was never
# used (every subsequent call passes `data =` or `teengamb$`), and
# attach() pollutes the search path, risking masked names.
teengamb.full <- lm(gamble ~ ., data = teengamb)
summary(teengamb.full)
##
## Call:
## lm(formula = gamble ~ ., data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sex -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
Check the constant variance assumption.
# Residuals vs fitted for the teengamb model: plot.lm version, then the
# same diagnostic drawn by hand.
plot(teengamb.full, which=1)
plot(teengamb.full$fitted.values, teengamb.full$residuals)
library(lmtest)
# Breusch-Pagan test; H0: homoscedastic errors.
bptest(teengamb.full)
##
## studentized Breusch-Pagan test
##
## data: teengamb.full
## BP = 6.4288, df = 4, p-value = 0.1693
Check the normality assumption.
# Normal Q-Q plot, residual histogram, and Shapiro-Wilk test
# (H0: residuals are normally distributed).
plot(teengamb.full, which=2)
hist(teengamb.full$residuals)
shapiro.test(residuals(teengamb.full))
##
## Shapiro-Wilk normality test
##
## data: residuals(teengamb.full)
## W = 0.86839, p-value = 8.16e-05
Check for the structure of the relationship between the predictors and the response.
# Added-variable plots for status, verbal, and sex.
# NOTE(fix): the third plot originally regressed `income` on
# status + sex + verbal while the response model omitted `sex`; the two
# residual sets did not correspond to the same left-out predictor. The
# AV plot for sex must regress sex on the remaining predictors
# (status + income + verbal). Distinct names are also used so the full
# model `teengamb.full` is not overwritten.
# status:
teengamb.y.nostatus <- lm(gamble ~ income + verbal + sex, data = teengamb)
teengamb.mlr.X1 <- lm(status ~ income + verbal + sex, data = teengamb)
plot(teengamb.mlr.X1$residuals, teengamb.y.nostatus$residuals)
# verbal:
teengamb.y.noverbal <- lm(gamble ~ status + income + sex, data = teengamb)
teengamb.mlr.X2 <- lm(verbal ~ status + income + sex, data = teengamb)
plot(teengamb.mlr.X2$residuals, teengamb.y.noverbal$residuals)
# sex:
teengamb.y.nosex <- lm(gamble ~ status + income + verbal, data = teengamb)
teengamb.mlr.X3 <- lm(sex ~ status + income + verbal, data = teengamb)
plot(teengamb.mlr.X3$residuals, teengamb.y.nosex$residuals)
Consider the prostate data from the faraway library. Fit a model with lpsa as the response and the other variables as predictors.
library(faraway)
# Load the prostate data from faraway and list its variables.
data("prostate")
colnames(prostate)
## [1] "lcavol" "lweight" "age" "lbph" "svi" "lcp" "gleason"
## [8] "pgg45" "lpsa"
# Fit lpsa on all remaining predictors.
# NOTE(fix): dropped `attach(prostate)` — never needed since `data =`
# (or `prostate` directly) is always used, and attach() risks masking
# names on the search path.
prostate.full <- lm(lpsa ~ ., data = prostate)
summary(prostate.full)
##
## Call:
## lm(formula = lpsa ~ ., data = prostate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7331 -0.3713 -0.0170 0.4141 1.6381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.669337 1.296387 0.516 0.60693
## lcavol 0.587022 0.087920 6.677 2.11e-09 ***
## lweight 0.454467 0.170012 2.673 0.00896 **
## age -0.019637 0.011173 -1.758 0.08229 .
## lbph 0.107054 0.058449 1.832 0.07040 .
## svi 0.766157 0.244309 3.136 0.00233 **
## lcp -0.105474 0.091013 -1.159 0.24964
## gleason 0.045142 0.157465 0.287 0.77503
## pgg45 0.004525 0.004421 1.024 0.30886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7084 on 88 degrees of freedom
## Multiple R-squared: 0.6548, Adjusted R-squared: 0.6234
## F-statistic: 20.86 on 8 and 88 DF, p-value: < 2.2e-16
Compute and comment on the condition numbers.
# Design matrix with the intercept column dropped.
x <- model.matrix(prostate.full)[, -1]
# Eigenvalues of X'X for the condition-number diagnostic.
e <- eigen(t(x) %*% x)
e$val
## [1] 4.790826e+05 6.190704e+04 2.109042e+02 1.756329e+02 6.479853e+01
## [6] 4.452379e+01 2.023914e+01 8.093145e+00
# Condition number = sqrt(largest eigenvalue / smallest eigenvalue).
# NOTE(fix): index the last eigenvalue generically rather than with the
# hard-coded 8, so this survives adding or removing predictors.
sqrt(e$val[1] / e$val[length(e$val)])
## [1] 243.3025
Compute and comment on the correlations between predictors.
# Pairwise correlations among all variables; note the larger predictor
# correlations below, e.g. lcp-lcavol (0.675) and gleason-pgg45 (0.752).
cor(prostate)
## lcavol lweight age lbph svi lcp
## lcavol 1.00000000 0.194128387 0.2249999 0.02734971 0.53884500 0.67531058
## lweight 0.19412839 1.000000000 0.3075247 0.43493174 0.10877818 0.10023889
## age 0.22499988 0.307524741 1.0000000 0.35018592 0.11765804 0.12766778
## lbph 0.02734971 0.434931744 0.3501859 1.00000000 -0.08584327 -0.00699944
## svi 0.53884500 0.108778185 0.1176580 -0.08584327 1.00000000 0.67311122
## lcp 0.67531058 0.100238891 0.1276678 -0.00699944 0.67311122 1.00000000
## gleason 0.43241705 -0.001283003 0.2688916 0.07782044 0.32041222 0.51482991
## pgg45 0.43365224 0.050846195 0.2761124 0.07846000 0.45764762 0.63152807
## lpsa 0.73446028 0.354121818 0.1695929 0.17980950 0.56621818 0.54881316
## gleason pgg45 lpsa
## lcavol 0.432417052 0.4336522 0.7344603
## lweight -0.001283003 0.0508462 0.3541218
## age 0.268891599 0.2761124 0.1695929
## lbph 0.077820444 0.0784600 0.1798095
## svi 0.320412221 0.4576476 0.5662182
## lcp 0.514829912 0.6315281 0.5488132
## gleason 1.000000000 0.7519045 0.3689867
## pgg45 0.751904512 1.0000000 0.4223157
## lpsa 0.368986693 0.4223157 1.0000000
Compute the variance inflation factors.
# Variance inflation factors for each predictor, rounded for display.
# NOTE(fix): spell out `digits` — `dig=` relied on partial argument
# matching.
round(vif(x), digits = 2)
## lcavol lweight age lbph svi lcp gleason pgg45
## 2.05 1.36 1.32 1.38 1.96 3.10 2.47 2.97
Consider the cheddar data from the faraway library. Fit a model with taste as the response and the other variables as predictors.
library(faraway)
# Load the cheddar taste data from faraway and list its variables.
data("cheddar")
colnames(cheddar)
## [1] "taste" "Acetic" "H2S" "Lactic"
# Preview the first six rows of the data.
head(cheddar)
## taste Acetic H2S Lactic
## 1 12.3 4.543 3.135 0.86
## 2 20.9 5.159 5.043 1.53
## 3 39.0 5.366 5.438 1.57
## 4 47.9 5.759 7.496 1.81
## 5 5.6 4.663 3.807 0.99
## 6 25.9 5.697 7.601 1.09
# Fit taste on all predictors.
# NOTE(fix): dropped `attach(cheddar)` — columns are always accessed
# via `data =` or `cheddar$`, and attach() pollutes the search path.
mod1ch <- lm(taste ~ ., data = cheddar)
summary(mod1ch)
##
## Call:
## lm(formula = taste ~ ., data = cheddar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.390 -6.612 -1.009 4.908 25.449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -28.8768 19.7354 -1.463 0.15540
## Acetic 0.3277 4.4598 0.073 0.94198
## H2S 3.9118 1.2484 3.133 0.00425 **
## Lactic 19.6705 8.6291 2.280 0.03108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.13 on 26 degrees of freedom
## Multiple R-squared: 0.6518, Adjusted R-squared: 0.6116
## F-statistic: 16.22 on 3 and 26 DF, p-value: 3.81e-06
Check the model assumptions.
# Standard 2x2 panel of lm diagnostics (residuals vs fitted, Q-Q,
# scale-location, residuals vs leverage).
# NOTE(review): par() settings are changed and never restored — later
# plots in this document inherit the 2x2 layout unless reset.
par(mfrow=c(2,2))
plot(mod1ch)
Is any transformation of the predictors suggested?
# Residuals against each predictor with a horizontal reference line at
# zero; a visible trend would suggest transforming that predictor.
plot(cheddar$Acetic, residuals(mod1ch)); abline(h = 0)
plot(cheddar$H2S, residuals(mod1ch)); abline(h = 0)
plot(cheddar$Lactic, residuals(mod1ch)); abline(h = 0)
No trends are observed in the three plots. The plots suggest there is no need for a transformation in the predictors.
Use the Box-Cox method to determine an optimal transformation of the response. Would it be reasonable to leave the response untransformed?
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.2
# Box-Cox profile log-likelihood over a fine lambda grid; report the
# lambda value(s) attaining the maximum.
sr.trans <- boxcox(mod1ch, lambda = seq(-2, 2, length = 400))
with(sr.trans, x[y == max(y)])
## [1] 0.6666667
# Approximate 95% confidence interval for lambda: all grid values whose
# log-likelihood lies within qchisq(0.95, 1)/2 of the maximum.
ci.lambda <- sr.trans$x[sr.trans$y > max(sr.trans$y) - qchisq(0.95, 1) / 2]
range(ci.lambda)
## [1] 0.4060150 0.9674185
The value of lambda that maximizes the log-likelihood is lambda-hat=0.667. Since lambda=0.5 lies in the 95% Confidence Interval, the square root transformation can be selected as an optimal transformation. Since lambda=1 is not included in the CI, we should apply a transformation to the response.
Use the optimal transformation of the response and refit the additive model. Does this make any difference to the transformations suggested for the predictors?
# Refit with the square-root-transformed response (lambda = 0.5 lies
# inside the Box-Cox confidence interval).
mod2ch <- lm(sqrt(taste) ~ ., data = cheddar)
summary(mod2ch)
##
## Call:
## lm(formula = sqrt(taste) ~ ., data = cheddar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.50082 -0.54933 0.04872 0.81119 2.26363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9237645 2.2562657 -0.409 0.68558
## Acetic -0.0004978 0.5098648 -0.001 0.99923
## H2S 0.4449854 0.1427277 3.118 0.00441 **
## Lactic 2.0184911 0.9865228 2.046 0.05099 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.158 on 26 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.5835
## F-statistic: 14.54 on 3 and 26 DF, p-value: 9.277e-06
# Residuals of the transformed fit against each predictor; the zero
# reference line helps spot trends that would suggest transforming a
# predictor.
plot(cheddar$Acetic, residuals(mod2ch)); abline(h = 0)
plot(cheddar$H2S, residuals(mod2ch)); abline(h = 0)
plot(cheddar$Lactic, residuals(mod2ch)); abline(h = 0)
If \(n=p\) and the \(\mathbf{X}\) matrix is invertible, show that the hat matrix \(\mathbf{H}\) is given by the \(p\times p\) identity matrix. In this case, what are \(h_{ii}\) and \(\hat{Y}_{i}\)?
If \(n=p\), then \(H\) is a square matrix, so we can write \[H = X(X^TX)^{-1}X^T = X( X^{-1} (X^T)^{-1} ) X^T = (X X^{-1}) ((X^T)^{-1} ) X^T) = I\] In this case, \(h_{ii}=1\) and \(\hat{Y} = H Y = Y\), which implies that \(\hat{Y}_{i}=Y_i\).
The whitewines.csv data set contains information related to white variants of the Portuguese “Vinho Verde” wine. Specifically, we have recorded the following information:
In this homework, our goal is to explain the relationship between alcohol level (dependent variable) and residual sugar, pH, density and fixed acidity.
# Read the semicolon-separated wine data and keep only the variables
# used in this analysis.
whitewines.all <- read.csv("whitewines.csv", sep = ";")
keep <- c("alcohol", "residual.sugar", "pH", "density", "fixed.acidity")
whitewines <- whitewines.all[, keep]
# Full model: alcohol on the four chosen predictors.
full_wine <- lm(alcohol ~ residual.sugar + pH + density + fixed.acidity,
                data = whitewines)
Check the constant variance assumption.
# Residuals vs fitted for the wine model (constant-variance check).
plot(full_wine, which=1)
Based on the plot alone, we observe that the constant variance assumption is violated. In fact, observations 2782, 1527 and 3902 are outliers, as we observed in the previous homework. We also perform the Breusch-Pagan test. Under the null hypothesis of this test there is homoscedasticity.
library(lmtest)
# Breusch-Pagan test; the tiny p-value below confirms heteroscedasticity.
bptest(full_wine)
##
## studentized Breusch-Pagan test
##
## data: full_wine
## BP = 250.65, df = 4, p-value < 2.2e-16
We reject the null hypothesis and conclude that the variance is not constant. This conclusion is similar to our conclusion based on the graphical test.
Check the normality assumption.
# Normal Q-Q plot and residual histogram for the normality check.
plot(full_wine, which=2)
hist(full_wine$residuals)
The distribution is right skewed, so we reject the normality assumption as well. This is also probably due to the presence of outliers in the data. If we perform a normality test, we corroborate the answer we got from the graphical test.
# Kolmogorov-Smirnov test of the residuals against the standard normal.
# NOTE(review): the KS test assumes fully specified parameters and no
# ties; residuals here violate both (hence the warning below), so the
# p-value is only a rough confirmation of the graphical check.
ks.test(residuals(full_wine), y=pnorm)
## Warning in ks.test(residuals(full_wine), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(full_wine)
## D = 0.23173, p-value < 2.2e-16
## alternative hypothesis: two-sided
Check for the structure of the relationship between the predictors and the response.
Our goal here is to check if the linearity assumption holds. So, we create added-variables plots:
# Added-variable plots: for each predictor, regress the response and
# that predictor on the remaining predictors, then plot the two sets of
# residuals against each other with the fitted AV line.
# NOTE(fix): garbled axis labels ("Residual sugar residual s",
# "Alcohol lev el") repaired; the y axes show residuals of alcohol, so
# they are labelled "Alcohol residuals"; `$res` replaced by
# `$residuals` ($res relied on partial name matching).
# Full model
full_wine <- lm(alcohol ~ residual.sugar + pH + density + fixed.acidity, data = whitewines)
# residual.sugar removed
y_sugar <- update(full_wine, ~ pH + density + fixed.acidity)$residuals
x_sugar <- lm(residual.sugar ~ pH + density + fixed.acidity, data = whitewines)$residuals
model1 <- lm(y_sugar ~ x_sugar)
# pH removed
y_pH <- update(full_wine, ~ residual.sugar + density + fixed.acidity)$residuals
x_pH <- lm(pH ~ residual.sugar + density + fixed.acidity, data = whitewines)$residuals
model2 <- lm(y_pH ~ x_pH)
# density removed
y_density <- update(full_wine, ~ residual.sugar + pH + fixed.acidity)$residuals
x_density <- lm(density ~ residual.sugar + pH + fixed.acidity, data = whitewines)$residuals
model3 <- lm(y_density ~ x_density)
# fixed.acidity removed
y_acidity <- update(full_wine, ~ residual.sugar + pH + density)$residuals
x_acidity <- lm(fixed.acidity ~ residual.sugar + pH + density, data = whitewines)$residuals
model4 <- lm(y_acidity ~ x_acidity)
plot(x_sugar, y_sugar, xlab = "Residual sugar residuals", ylab = "Alcohol residuals")
abline(model1)
plot(x_pH, y_pH, xlab = "pH residuals", ylab = "Alcohol residuals")
abline(model2)
plot(x_density, y_density, xlab = "Density residuals", ylab = "Alcohol residuals")
abline(model3)
plot(x_acidity, y_acidity, xlab = "Fixed Acidity residuals", ylab = "Alcohol residuals")
abline(model4)
In all plots, the points are evenly scattered about the line, which leads us to conclude that the linearity assumption is not violated. Note here, that a line with a positive/negative slope does not make us reject the linearity hypothesis.
Is any transformation of the predictors suggested?
No, since we found no evidence that the linearity assumption is violated. Use the Box-Cox method to determine an optimal transformation of the response. Would it be reasonable to leave the response untransformed?
# Sanity check before Box-Cox, which requires a strictly positive
# response: confirm no rows have non-positive alcohol.
whitewines[whitewines$alcohol <= 0,]
## [1] alcohol residual.sugar pH density fixed.acidity
## <0 rows> (or 0-length row.names)
library(MASS)
# Box-Cox profile over a fine lambda grid; extract the maximizing
# lambda for use in the transformation below.
wines.boxcox <- boxcox(full_wine, lambda = seq(-2, 2, length = 400))
lambda <- with(wines.boxcox, x[y == max(y)])
lambda
## [1] -0.2857143
The optimal lambda is -0.29 approximately, with bounds that do not contain 1 (when 1 is in the interval, then this suggests that no transformation is needed). So, here, we can say that the Box-Cox suggest raising the response to a power equal to -0.29.
In the lectures, we discussed that for convenience in the interpretation, one can choose to round the lambda to the closest 0.5 multiple. So, here we could choose to transform the variable with a lambda equal to -0.5 instead, or even with the log transform. Use the optimal transformation of the response and refit the additive model. Does this make any difference to the transformations suggested for the predictors?
In the code below, we choose to work with the optimal lambda:
(In the hw, if you chose to proceed with -0.5 or log, no points will be deducted.)
# Apply the Box-Cox transformation of the response with the optimal
# lambda, then refit the additive model on the transformed scale.
whitewines$alcohol.new <- (whitewines$alcohol^lambda - 1) / lambda
wines.transform <- lm(alcohol.new ~ residual.sugar + pH + density + fixed.acidity,
                      data = whitewines)
summary(wines.transform)
##
## Call:
## lm(formula = alcohol.new ~ residual.sugar + pH + density + fixed.acidity,
## data = whitewines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16219 -0.01280 -0.00058 0.01047 0.81170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.311e+01 2.168e-01 152.67 <2e-16 ***
## residual.sugar 1.087e-02 1.291e-04 84.22 <2e-16 ***
## pH 1.240e-01 2.522e-03 49.14 <2e-16 ***
## density -3.223e+01 2.228e-01 -144.70 <2e-16 ***
## fixed.acidity 2.600e-02 4.709e-04 55.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02246 on 4893 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.853
## F-statistic: 7106 on 4 and 4893 DF, p-value: < 2.2e-16
After fitting the transformed model, we re-check the model assumptions.
# Re-check constant variance on the transformed fit: residuals vs
# fitted plot and Breusch-Pagan test.
plot(wines.transform, which=1)
bptest(wines.transform)
##
## studentized Breusch-Pagan test
##
## data: wines.transform
## BP = 266.48, df = 4, p-value < 2.2e-16
# KS normality test on the transformed-model residuals (same caveats as
# before about ties and a fully specified null distribution).
ks.test(residuals(wines.transform), y=pnorm)
## Warning in ks.test(residuals(wines.transform), y = pnorm): ties should not be
## present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(wines.transform)
## D = 0.47769, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Normal Q-Q plot for the transformed model.
plot(wines.transform, which=2)
As we can see, the variance and (especially) the normality look slightly better, but the assumptions are still violated, probably due to the presence of outliers.
In practice (this is not required in this hw), we would proceed by removing the outliers (before any transformations) that we identified in this and the previous homework, and then we would re-fit the model. After refitting the model, we would check the model assumptions. IF you do this process, you will find out that the assumptions are still violated. So, you would again check for a suitable transformation of the response, re-fit the transformed model without the outliers and then check one final time the model assumptions. It might be the case that the assumptions would no longer be violated, but this is not necessarily the case. In fact, in this data set, you will find out that although the residuals will look slightly better, the normality and variance assumptions will still be violated.
So, the question is why? Well, that depends on the problem. Here, we have a very large dataset and we attempt to explain the variation in the response with only 4 variables. It might be the case that if we incorporated more variables in the model, then we would remedy most (if not all) of the departures from the model assumptions.